Let's load in the data, as well as check the 81 variables that we have.
# Read the Kaggle train/test CSVs. data.table = FALSE makes fread() return a
# plain data.frame directly, removing the redundant data.frame() conversion
# that previously copied each table a second time.
train <- fread('train.csv', stringsAsFactors = TRUE, data.table = FALSE)
test <- fread('test.csv', stringsAsFactors = TRUE, data.table = FALSE)
names(train)## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# apply class to each element by columns
#sapply(train, class)
str(train)## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
## $ Alley : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
## $ LotShape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
## $ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Utilities : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ LandSlope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 16 12 22 15 18 4 ...
## $ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
## $ Condition2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ Exterior2nd : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
## $ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
## $ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
## $ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
## $ GarageType : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
## $ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ PavedDrive : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
## $ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
## $ MiscFeature : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : Factor w/ 9 levels "COD","CWD","Con",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
It is important to make it a practice to check for missing values. If a particular variable has too many missing values, we may have to remove it
# Count missing values per column, then rank columns by NA count (descending).
# colSums(is.na(.)) is the vectorized idiom; it replaces the per-column
# length(which(is.na(x) == T)) anti-pattern (never compare with == T, and
# is.na() already yields the logical mask we need).
Num_NA <- colSums(is.na(train))
# Keep the data.frame(colnames(train), ...) construction so the printed
# column name ("colnames.train.") matches the output shown below.
missing.df <- data.frame(colnames(train), Num_NA)
missing.df <- missing.df[order(missing.df$Num_NA, decreasing = TRUE), ]
head(missing.df, 10)## colnames.train. Num_NA
## PoolQC PoolQC 1453
## MiscFeature MiscFeature 1406
## Alley Alley 1369
## Fence Fence 1179
## FireplaceQu FireplaceQu 690
## LotFrontage LotFrontage 259
## GarageType GarageType 81
## GarageYrBlt GarageYrBlt 81
## GarageFinish GarageFinish 81
## GarageQual GarageQual 81
We can see the top variables with NA. We have to remove them. Let's remove:
# Drop the four NA-dominated columns from both data sets in one shot:
# assigning NULL to a character vector of names removes those columns
# from a data.frame, exactly as the per-column `$<- NULL` form does.
na.heavy.cols <- c("PoolQC", "MiscFeature", "Alley", "Fence")
train[na.heavy.cols] <- NULL
test[na.heavy.cols] <- NULL
# extract all numerical variables (logical index of natively numeric columns,
# taken before any factors are recoded to integers)
all.numeric.variables.index <- vapply(train, is.numeric, logical(1))
all.numeric.variables <- train[, all.numeric.variables.index]
# Convert every factor column of the training set into its integer level
# codes so all variables are numeric for the models below.
# seq_along(train) is the safe loop idiom: 1:length(colnames(train)) would
# yield c(1, 0) for a zero-column frame.
for (i in seq_along(train)) {
  if (is.factor(train[, i])) {
    train[, i] <- as.integer(train[, i])
  }
}
# lets check
sapply(train, class)## Id MSSubClass MSZoning LotFrontage LotArea
## "integer" "integer" "integer" "integer" "integer"
## Street LotShape LandContour Utilities LotConfig
## "integer" "integer" "integer" "integer" "integer"
## LandSlope Neighborhood Condition1 Condition2 BldgType
## "integer" "integer" "integer" "integer" "integer"
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## "integer" "integer" "integer" "integer" "integer"
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## "integer" "integer" "integer" "integer" "integer"
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## "integer" "integer" "integer" "integer" "integer"
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## "integer" "integer" "integer" "integer" "integer"
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## "integer" "integer" "integer" "integer" "integer"
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## "integer" "integer" "integer" "integer" "integer"
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## "integer" "integer" "integer" "integer" "integer"
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## "integer" "integer" "integer" "integer" "integer"
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## "integer" "integer" "integer" "integer" "integer"
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## "integer" "integer" "integer" "integer" "integer"
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## "integer" "integer" "integer" "integer" "integer"
## PoolArea MiscVal MoSold YrSold SaleType
## "integer" "integer" "integer" "integer" "integer"
## SaleCondition SalePrice
## "integer" "integer"
# do the same for the test set so we can use it later on:
# (same factor -> integer-level-code conversion as for train;
# seq_along(test) replaces the unsafe 1:length(colnames(test)) form)
for (i in seq_along(test)) {
  if (is.factor(test[, i])) {
    test[, i] <- as.integer(test[, i])
  }
}
Now, we will need to fill all the remaining NA as 0
train[is.na(train)] <- 0
all.numeric.variables[is.na(all.numeric.variables)] <- 0
# do the same to test
test[is.na(test)] <- 0 Focus the initial analysis on the numeric variables (not the ones we turned from factors to numerical) Since we do not need the id column for the correlation, we can remove it
correlation.df <- cor(all.numeric.variables[,-1])
corrplot(correlation.df, method = "color", tl.cex = 0.5, type = "lower", sig.level = 0.05, insig = "blank", order = "hclust", addCoef.col = "black", number.cex = 0.35 ,diag = FALSE )Seems like we have some variables that have strong relationship with each other. The suspects are:
#pairs(~SalePrice+OverallQual+TotalBsmtSF+X1stFlrSF+GarageArea+GarageCars+GrLivArea , data = train)
featurePlot(x = train[,c("OverallQual","TotalBsmtSF","X1stFlrSF","GarageArea","GarageCars","GrLivArea")] ,
y = train[,"SalePrice"],
plot = "pairs",
# add key at the top
auto.key = list(columns = 3)
)Very obvious that multicollinearity exists in these variables.
plot_ly(all.numeric.variables, x = ~YearBuilt, y= ~SalePrice,type = 'scatter', mode = 'markers', name = "scatter") %>%
add_trace(y = fitted(lm(train$SalePrice ~ train$YearBuilt)), mode = 'lines', line = list(shape = "spline", smoothing = 1), name = 'smooth')#ggplot(train, aes(x= YearBuilt, y = SalePrice)) + geom_point() + geom_smooth()A general upward trend the older the house, the higher the price level of the house.
Let's do a partition using the caret package to prep the train set, before we go into the modelling aspect.
# Reproducible ~60/40 split of the training data via caret.
# NOTE(review): `random` is not defined anywhere in this file — it must hold
# an integer seed set earlier in the session; confirm.
set.seed(random)
outcome <- train$SalePrice
# list = FALSE (spelled out — never the reassignable F) returns a matrix of
# row indices rather than a list.
partition <- createDataPartition(y = outcome, p = 0.599, list = FALSE)
# split train into 2: 6:4
training_split <- train[partition, ]
testing_split <- train[-partition,]Most basic model will be the regression model. We start by fitting all the variables to have a look:
# Baseline: OLS on every available predictor (including the Id column).
reg.model <- lm(SalePrice ~. , data = training_split)
summary(reg.model)##
## Call:
## lm(formula = SalePrice ~ ., data = training_split)
##
## Residuals:
## Min 1Q Median 3Q Max
## -328846 -14683 -755 13186 314877
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.104e+06 1.797e+06 2.284 0.022622 *
## Id -1.033e+00 2.756e+00 -0.375 0.707804
## MSSubClass -1.194e+02 6.388e+01 -1.869 0.061932 .
## MSZoning -1.068e+03 2.110e+03 -0.506 0.612991
## LotFrontage -5.290e+01 3.736e+01 -1.416 0.157213
## LotArea 1.714e-01 1.485e-01 1.154 0.248812
## Street 3.084e+04 1.647e+04 1.873 0.061491 .
## LotShape -1.338e+03 9.127e+02 -1.466 0.143056
## LandContour 3.756e+03 1.780e+03 2.110 0.035161 *
## Utilities -5.210e+04 3.518e+04 -1.481 0.139093
## LotConfig 1.250e+02 7.462e+02 0.168 0.867012
## LandSlope 1.283e+04 4.965e+03 2.584 0.009950 **
## Neighborhood 4.458e+02 2.082e+02 2.141 0.032543 *
## Condition1 -2.395e+02 1.246e+03 -0.192 0.847655
## Condition2 -8.349e+03 4.530e+03 -1.843 0.065720 .
## BldgType -1.552e+03 2.118e+03 -0.733 0.463792
## HouseStyle -2.085e+03 8.670e+02 -2.404 0.016431 *
## OverallQual 1.286e+04 1.589e+03 8.096 2.11e-15 ***
## OverallCond 4.395e+03 1.416e+03 3.103 0.001981 **
## YearBuilt 2.188e+02 1.029e+02 2.126 0.033801 *
## YearRemodAdd 6.795e+01 9.051e+01 0.751 0.453033
## RoofStyle 2.996e+03 1.550e+03 1.933 0.053629 .
## RoofMatl 1.987e+03 1.842e+03 1.078 0.281175
## Exterior1st 5.157e+02 7.125e+02 0.724 0.469441
## Exterior2nd -6.296e+02 6.454e+02 -0.976 0.329565
## MasVnrType 2.534e+03 2.036e+03 1.245 0.213457
## MasVnrArea 2.929e+01 8.607e+00 3.403 0.000700 ***
## ExterQual -4.693e+03 2.735e+03 -1.716 0.086540 .
## ExterCond -3.919e+02 1.648e+03 -0.238 0.812125
## Foundation 1.408e+02 2.309e+03 0.061 0.951416
## BsmtQual -7.762e+03 1.882e+03 -4.123 4.12e-05 ***
## BsmtCond 5.102e+03 1.778e+03 2.869 0.004226 **
## BsmtExposure -3.024e+03 1.197e+03 -2.526 0.011737 *
## BsmtFinType1 -9.580e+02 8.149e+02 -1.176 0.240098
## BsmtFinSF1 1.306e+00 7.168e+00 0.182 0.855425
## BsmtFinType2 3.522e+03 1.580e+03 2.229 0.026105 *
## BsmtFinSF2 1.626e+01 1.062e+01 1.532 0.125965
## BsmtUnfSF -1.974e+00 6.935e+00 -0.285 0.775999
## TotalBsmtSF NA NA NA NA
## Heating -6.193e+02 4.598e+03 -0.135 0.892902
## HeatingQC -8.036e+02 8.305e+02 -0.968 0.333493
## CentralAir -2.406e+02 6.153e+03 -0.039 0.968815
## Electrical -5.725e+02 1.181e+03 -0.485 0.628115
## X1stFlrSF 3.724e+01 8.376e+00 4.447 9.96e-06 ***
## X2ndFlrSF 4.730e+01 6.516e+00 7.258 9.27e-13 ***
## LowQualFinSF 2.529e+01 2.391e+01 1.058 0.290359
## GrLivArea NA NA NA NA
## BsmtFullBath 1.196e+04 3.268e+03 3.661 0.000268 ***
## BsmtHalfBath 3.955e+03 5.142e+03 0.769 0.442102
## FullBath 2.266e+03 3.535e+03 0.641 0.521664
## HalfBath -3.634e+03 3.454e+03 -1.052 0.293030
## BedroomAbvGr -1.373e+03 2.289e+03 -0.600 0.548854
## KitchenAbvGr -1.414e+04 7.317e+03 -1.932 0.053678 .
## KitchenQual -8.758e+03 1.958e+03 -4.472 8.86e-06 ***
## TotRmsAbvGrd 1.811e+03 1.618e+03 1.119 0.263438
## Functional 1.984e+03 1.460e+03 1.359 0.174437
## Fireplaces 6.698e+03 3.629e+03 1.846 0.065298 .
## FireplaceQu -8.139e+02 1.085e+03 -0.750 0.453408
## GarageType -5.351e+02 8.633e+02 -0.620 0.535529
## GarageYrBlt -2.520e+00 8.547e+00 -0.295 0.768178
## GarageFinish -1.827e+03 1.953e+03 -0.936 0.349686
## GarageCars 1.759e+04 3.987e+03 4.413 1.16e-05 ***
## GarageArea -3.241e+00 1.303e+01 -0.249 0.803639
## GarageQual -2.246e+03 2.287e+03 -0.982 0.326487
## GarageCond 1.208e+03 2.781e+03 0.434 0.664208
## PavedDrive 2.855e+03 2.843e+03 1.004 0.315676
## WoodDeckSF 1.192e+01 1.003e+01 1.189 0.234621
## OpenPorchSF -1.921e+01 1.962e+01 -0.979 0.327813
## EnclosedPorch 1.874e+01 2.230e+01 0.840 0.401072
## X3SsnPorch 3.852e+01 3.899e+01 0.988 0.323447
## ScreenPorch 6.634e+01 2.324e+01 2.855 0.004418 **
## PoolArea -3.896e+01 3.934e+01 -0.990 0.322278
## MiscVal -2.283e+00 3.716e+00 -0.614 0.539064
## MoSold 2.072e+02 4.397e+02 0.471 0.637564
## YrSold -2.322e+03 8.841e+02 -2.626 0.008803 **
## SaleType -9.708e+01 7.772e+02 -0.125 0.900625
## SaleCondition 2.109e+03 1.066e+03 1.979 0.048162 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32540 on 801 degrees of freedom
## Multiple R-squared: 0.8318, Adjusted R-squared: 0.8162
## F-statistic: 53.52 on 74 and 801 DF, p-value: < 2.2e-16
Very good R-squared value, which says that the explanatory variables are explaining SalePrice well. At the same time, there are many variables which have no effect on SalePrice. We can now implement the stepwise algorithm for variable selection, based on AIC values.
# Reduced OLS model: the predictor subset produced by the stepwise (AIC)
# selection described above, fitted on the 60% training partition.
reg.model.2 <- lm(formula = SalePrice ~ OverallQual + GrLivArea + YearBuilt +
KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea +
ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st +
BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea +
YrSold + ScreenPorch + Street + SaleCondition + GarageFinish +
BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd +
LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood +
KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure +
Condition1, data = training_split)
summary(reg.model.2)##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + YearBuilt +
## KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea +
## ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st +
## BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea +
## YrSold + ScreenPorch + Street + SaleCondition + GarageFinish +
## BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd +
## LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood +
## KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure +
## Condition1, data = training_split)
##
## Residuals:
## Min 1Q Median 3Q Max
## -352046 -15169 -1050 13586 328482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.116e+06 1.736e+06 2.371 0.017965 *
## OverallQual 1.333e+04 1.538e+03 8.666 < 2e-16 ***
## GrLivArea 4.197e+01 5.253e+00 7.989 4.49e-15 ***
## YearBuilt 2.076e+02 6.923e+01 2.999 0.002790 **
## KitchenQual -9.041e+03 1.884e+03 -4.798 1.89e-06 ***
## BsmtFullBath 1.189e+04 2.585e+03 4.601 4.85e-06 ***
## MSSubClass -1.896e+02 3.318e+01 -5.715 1.52e-08 ***
## GarageCars 1.463e+04 2.305e+03 6.349 3.55e-10 ***
## LotArea 2.425e-01 1.389e-01 1.746 0.081189 .
## ExterQual -5.598e+03 2.595e+03 -2.157 0.031304 *
## OverallCond 4.963e+03 1.186e+03 4.184 3.17e-05 ***
## PoolArea -5.150e+01 3.586e+01 -1.436 0.151375
## WoodDeckSF 1.707e+01 9.681e+00 1.763 0.078218 .
## Exterior1st 6.099e+02 6.933e+02 0.880 0.379278
## BsmtQual -7.792e+03 1.789e+03 -4.355 1.50e-05 ***
## BsmtCond 4.816e+03 1.667e+03 2.889 0.003967 **
## Fireplaces 4.474e+03 2.164e+03 2.068 0.038976 *
## MasVnrType 2.631e+03 1.954e+03 1.347 0.178465
## MasVnrArea 2.860e+01 8.065e+00 3.546 0.000413 ***
## YrSold -2.288e+03 8.583e+02 -2.666 0.007819 **
## ScreenPorch 6.559e+01 2.230e+01 2.941 0.003361 **
## Street 3.382e+04 1.598e+04 2.116 0.034623 *
## SaleCondition 2.037e+03 1.009e+03 2.019 0.043812 *
## GarageFinish -3.186e+03 1.441e+03 -2.211 0.027283 *
## BsmtFinType2 3.374e+03 1.455e+03 2.318 0.020690 *
## BsmtFinType1 -1.433e+03 7.154e+02 -2.004 0.045410 *
## BsmtFinSF2 1.235e+01 9.302e+00 1.327 0.184826
## TotRmsAbvGrd 1.319e+03 1.568e+03 0.841 0.400514
## LandSlope 1.138e+04 4.578e+03 2.485 0.013166 *
## EnclosedPorch 1.486e+01 2.147e+01 0.692 0.489055
## Functional 1.857e+03 1.413e+03 1.314 0.189233
## BedroomAbvGr -1.790e+03 2.163e+03 -0.827 0.408276
## Neighborhood 4.202e+02 1.990e+02 2.111 0.035064 *
## KitchenAbvGr -1.425e+04 6.819e+03 -2.090 0.036923 *
## X1stFlrSF -3.801e+00 4.516e+00 -0.842 0.400242
## FullBath 3.971e+03 3.106e+03 1.278 0.201460
## Exterior2nd -6.115e+02 6.308e+02 -0.969 0.332651
## BsmtExposure -2.752e+03 1.134e+03 -2.426 0.015457 *
## Condition1 -1.838e+02 1.190e+03 -0.154 0.877311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32620 on 837 degrees of freedom
## Multiple R-squared: 0.8233, Adjusted R-squared: 0.8153
## F-statistic: 102.7 on 38 and 837 DF, p-value: < 2.2e-16
Now, the model looks very good, and all the variables have an effect on SalePrice. Let us check the RMSE metric defined by Kaggle.
We do this by doing an out-of-sample comparison: we obtain predictions by using our model on the testing_split data set and compare them with the actual testing_split values.
Some residual diagnosis: checking for the presence of heteroskedasticity.
We can plot residuals vs fitted values to see if there is any pattern: if there is, heteroskedasticity is present.
In fact, we can use a code to check:
bptest(reg.model.2)##
## studentized Breusch-Pagan test
##
## data: reg.model.2
## BP = 471.76, df = 38, p-value < 2.2e-16
# Refit the reduced model with rms::ols() (x = TRUE keeps the design matrix)
# so robcov() can apply a heteroskedasticity-robust (sandwich) covariance
# correction, motivated by the significant Breusch-Pagan test above.
# NOTE(review): this formula adds `+ Id` (the arbitrary row identifier),
# which reg.model.2 above does NOT include — presumably unintentional;
# confirm before relying on the coefficient table.
model <- ols(SalePrice ~ OverallQual + GrLivArea + YearBuilt +
KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea +
ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st +
BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea +
YrSold + ScreenPorch + Street + SaleCondition + GarageFinish +
BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd +
LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood +
KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure +
Condition1 + Id, data = training_split, x = TRUE)
# robcov() adjusts the standard errors / t statistics only; the point
# estimates are unchanged.
reg.model.2.corrected <- robcov(model)
reg.model.2.corrected## Linear Regression Model
##
## ols(formula = SalePrice ~ OverallQual + GrLivArea + YearBuilt +
## KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea +
## ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st +
## BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea +
## YrSold + ScreenPorch + Street + SaleCondition + GarageFinish +
## BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd +
## LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood +
## KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure +
## Condition1 + Id, data = training_split, x = TRUE)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 876 LR chi2 1518.96 R2 0.823
## sigma32631.8551 d.f. 39 R2 adj 0.815
## d.f. 836 Pr(> chi2) 0.0000 g 75940.692
##
## Residuals
##
## Min 1Q Median 3Q Max
## -351535 -15200 -1117 13251 328446
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 4112810.6849 1648810.4804 2.49 0.0128
## OverallQual 13251.3602 2072.8919 6.39 <0.0001
## GrLivArea 41.9887 14.1715 2.96 0.0031
## YearBuilt 207.9856 92.3151 2.25 0.0245
## KitchenQual -9049.4636 2158.2764 -4.19 <0.0001
## BsmtFullBath 11842.4359 2222.5609 5.33 <0.0001
## MSSubClass -188.4168 40.0264 -4.71 <0.0001
## GarageCars 14694.3636 3951.3366 3.72 0.0002
## LotArea 0.2376 0.2295 1.04 0.3009
## ExterQual -5617.8979 3545.8978 -1.58 0.1135
## OverallCond 4965.8627 1207.0570 4.11 <0.0001
## PoolArea -50.6519 220.8765 -0.23 0.8187
## WoodDeckSF 16.8371 9.5924 1.76 0.0796
## Exterior1st 627.8258 935.6123 0.67 0.5024
## BsmtQual -7787.1083 1866.4897 -4.17 <0.0001
## BsmtCond 4803.0730 1334.5870 3.60 0.0003
## Fireplaces 4455.8031 2660.2048 1.67 0.0943
## MasVnrType 2678.2581 2972.2692 0.90 0.3678
## MasVnrArea 28.5386 15.3670 1.86 0.0636
## YrSold -2286.8082 790.0984 -2.89 0.0039
## ScreenPorch 65.5507 34.8310 1.88 0.0602
## Street 33937.8914 16371.8040 2.07 0.0385
## SaleCondition 2022.3028 1220.9625 1.66 0.0980
## GarageFinish -3225.6361 1387.3886 -2.32 0.0203
## BsmtFinType2 3416.7908 1123.5414 3.04 0.0024
## BsmtFinType1 -1418.2795 588.3831 -2.41 0.0161
## BsmtFinSF2 12.6422 7.4720 1.69 0.0910
## TotRmsAbvGrd 1324.9740 2363.9788 0.56 0.5753
## LandSlope 11357.3707 7591.7576 1.50 0.1350
## EnclosedPorch 14.7319 30.2178 0.49 0.6260
## Functional 1872.8096 1346.4032 1.39 0.1646
## BedroomAbvGr -1761.1074 2927.6072 -0.60 0.5476
## Neighborhood 418.9487 245.1648 1.71 0.0879
## KitchenAbvGr -14284.3589 5239.0428 -2.73 0.0065
## X1stFlrSF -3.6696 10.2509 -0.36 0.7204
## FullBath 3983.8102 4672.2094 0.85 0.3941
## Exterior2nd -640.4179 923.0366 -0.69 0.4880
## BsmtExposure -2775.5999 1203.2649 -2.31 0.0213
## Condition1 -203.1715 1257.2726 -0.16 0.8717
## Id -1.7063 2.2156 -0.77 0.4414
##
With the correction done (it does not affect the point estimates, since it only makes changes to the standard errors and t statistic values), we can now assess the model via the Kaggle metric: RMSE on the log values.
prediction.reg <- predict(reg.model.2.corrected, testing_split)
rmse.reg <- data.frame(log(prediction.reg), log(testing_split$SalePrice))
rmse(rmse.reg$log.prediction.reg.,rmse.reg$log.testing_split.SalePrice.)## [1] 0.1622399
LASSO regression can help reduce multicollinearity: Similar to the xgboost function, we need to ensure they are in matrix form.
# lars() needs plain matrices. Select columns by NAME rather than the
# positional 1:76 / 77 indices so the code no longer silently breaks if the
# column order ever changes (SalePrice is the response; everything else,
# including Id, stays a covariate — same matrices as before).
predictor.cols <- setdiff(names(training_split), "SalePrice")
covariate.variable <- as.matrix(training_split[, predictor.cols]) # exclude y
dependent.variable <- as.matrix(training_split[, "SalePrice"])
testing.variable <- as.matrix(testing_split[, predictor.cols])
lasso <- lars(covariate.variable, dependent.variable, type = 'lasso')
plot(lasso)Since we need the least multicollinearity, we can use Mallows' Cp: we obtain the step with the smallest Cp value from the lasso model we ran, and use this value in the prediction function
optimal.step <- lasso$df[which.min(lasso$Cp)]
prediction.lasso <- predict.lars(lasso, newx = testing.variable, s = optimal.step, type = "fit" )
rmse.lasso <- data.frame(test = log(testing_split$SalePrice), lasso = log(prediction.lasso$fit))
rmse(rmse.lasso$test,rmse.lasso$lasso)## [1] 0.1575195
Looking good with the improvement :)
# Random forest on all predictors with randomForest's default hyperparameters.
rf <- randomForest(SalePrice ~. , data = training_split)
plot(rf)importance(rf)## IncNodePurity
## Id 1.954005e+10
## MSSubClass 1.298422e+10
## MSZoning 9.289012e+09
## LotFrontage 4.418033e+10
## LotArea 9.177708e+10
## Street 1.744684e+08
## LotShape 9.191824e+09
## LandContour 1.227413e+10
## Utilities 2.137381e+06
## LotConfig 6.753246e+09
## LandSlope 1.037402e+10
## Neighborhood 3.360471e+10
## Condition1 4.160271e+09
## Condition2 8.661295e+08
## BldgType 3.527341e+09
## HouseStyle 6.155201e+09
## OverallQual 1.107987e+12
## OverallCond 2.193869e+10
## YearBuilt 2.827881e+11
## YearRemodAdd 7.750877e+10
## RoofStyle 4.643791e+09
## RoofMatl 3.631695e+09
## Exterior1st 1.039458e+10
## Exterior2nd 8.763262e+09
## MasVnrType 5.215892e+09
## MasVnrArea 4.590954e+10
## ExterQual 4.246745e+11
## ExterCond 6.953708e+09
## Foundation 5.841040e+09
## BsmtQual 8.982764e+10
## BsmtCond 3.417276e+09
## BsmtExposure 7.853481e+09
## BsmtFinType1 8.545312e+09
## BsmtFinSF1 1.341033e+11
## BsmtFinType2 2.350457e+09
## BsmtFinSF2 2.780870e+09
## BsmtUnfSF 2.612220e+10
## TotalBsmtSF 2.250196e+11
## Heating 5.407842e+08
## HeatingQC 8.979582e+09
## CentralAir 6.473470e+09
## Electrical 9.518527e+08
## X1stFlrSF 1.433146e+11
## X2ndFlrSF 1.869299e+11
## LowQualFinSF 2.280316e+09
## GrLivArea 5.995296e+11
## BsmtFullBath 7.381454e+09
## BsmtHalfBath 1.539058e+09
## FullBath 5.269536e+10
## HalfBath 7.935952e+09
## BedroomAbvGr 1.464085e+10
## KitchenAbvGr 3.712873e+09
## KitchenQual 7.717333e+10
## TotRmsAbvGrd 2.922406e+10
## Functional 1.506942e+09
## Fireplaces 3.810044e+10
## FireplaceQu 4.382973e+10
## GarageType 1.442236e+10
## GarageYrBlt 7.914855e+10
## GarageFinish 1.706542e+10
## GarageCars 5.003178e+11
## GarageArea 2.098376e+11
## GarageQual 4.029363e+09
## GarageCond 6.794172e+09
## PavedDrive 1.911447e+09
## WoodDeckSF 2.073760e+10
## OpenPorchSF 3.419679e+10
## EnclosedPorch 4.193206e+09
## X3SsnPorch 1.239280e+09
## ScreenPorch 4.338597e+09
## PoolArea 4.139390e+10
## MiscVal 7.484207e+08
## MoSold 1.420899e+10
## YrSold 7.766156e+09
## SaleType 3.617993e+09
## SaleCondition 1.843741e+10
prediction.rf <- predict(rf, newdata = testing_split)
rmse.rf <- data.frame(test = log(testing_split$SalePrice), rf = log(prediction.rf) )
rmse(rmse.rf$test,rmse.rf$rf)## [1] 0.1522375
Very good prediction here!
One of the most powerful algorithm here. Let us try this out in this dataset!
For the data to be used in xgboost, we must ensure that they are in sparse matrix form.
# Build xgboost design matrices: [2:77] drops the Id column, the formula LHS
# keeps SalePrice out of the predictors, and `-1` removes the intercept column.
training_split.sparse <- sparse.model.matrix(SalePrice ~. -1 , data = training_split[2:77]) # removing sales price as well as id
d_train <- xgb.DMatrix(data = training_split.sparse, label = training_split$SalePrice )
testing_split.sparse <- sparse.model.matrix(SalePrice~. -1, data = testing_split[2:77])
d_valid <- xgb.DMatrix(data = testing_split.sparse, label = testing_split$SalePrice)Using cross validation, we can obtain the optimal nrounds based on these parameters that we fixed. We define the early stopping round to be 50, and this meant that if there are no improvement for 50 boosting rounds, the iteration will stop. That is the optimal nrounds which we will use.
# Hyperparameters for the xgb.cv run: gradient-boosted trees on a
# squared-error regression objective.
param.cv <- list(
seed = random,  # `random` must hold a seed defined earlier in the session — confirm
objective="reg:linear",  # NOTE(review): renamed "reg:squarederror" in xgboost >= 0.90
booster="gbtree",
eta=0.1,  # learning rate (shrinkage per boosting round)
max.depth=8,
subsample=0.8,  # fraction of rows sampled per boosting round
colsample_bytree = 0.7,  # fraction of columns sampled per tree
num_parallel_tree = 1,
min_child_weight = 1
)
checking <- xgb.cv(data = d_train, params = param.cv, early_stopping_rounds = 50,
nrounds = 1000, verbose = 1, nfold = 5, metrics = "rmse", print_every_n = 50)## [1] train-rmse:177101.565625+887.275937 test-rmse:177134.181250+4174.168977
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
##
## [51] train-rmse:5894.306250+302.332955 test-rmse:30372.836719+4422.231596
## [101] train-rmse:1821.870605+195.657007 test-rmse:29936.927734+4490.337128
## [151] train-rmse:595.709705+116.521429 test-rmse:29903.619922+4506.269363
## [201] train-rmse:182.917391+42.345203 test-rmse:29890.121094+4515.339244
## [251] train-rmse:52.433025+9.974365 test-rmse:29885.723437+4519.550660
## [301] train-rmse:17.053361+4.297775 test-rmse:29885.227734+4519.455235
## [351] train-rmse:5.552435+1.476018 test-rmse:29885.065235+4519.469763
## Stopping. Best iteration:
## [319] train-rmse:11.283068+2.784754 test-rmse:29885.009766+4519.460684
# Final training parameters: identical to param.cv, plus an explicit rmse
# eval_metric so the watchlist reports the same metric used by Kaggle.
param <- list(
seed = random,  # `random` must hold a seed defined earlier in the session — confirm
objective="reg:linear",  # NOTE(review): renamed "reg:squarederror" in xgboost >= 0.90
booster="gbtree",
eta=0.1,  # learning rate (shrinkage per boosting round)
max.depth=8,
subsample=0.8,  # fraction of rows sampled per boosting round
colsample_bytree = 0.7,  # fraction of columns sampled per tree
num_parallel_tree = 1,
min_child_weight = 1,
eval_metric = "rmse"
)
watchlist = list(train = d_train, test = d_valid)
xgmodel <- xgb.train(params = param, data = d_train, nrounds = 319, watchlist = watchlist, verbose = 1, print_every_n = 50)## [1] train-rmse:177116.031250 test-rmse:182382.984375
## [51] train-rmse:6015.793457 test-rmse:28845.970703
## [101] train-rmse:2010.959961 test-rmse:27883.177734
## [151] train-rmse:714.633423 test-rmse:27845.777344
## [201] train-rmse:247.084396 test-rmse:27843.916016
## [251] train-rmse:82.004417 test-rmse:27840.111328
## [301] train-rmse:28.694693 test-rmse:27840.476562
## [319] train-rmse:20.059902 test-rmse:27840.406250
d_test <- xgb.DMatrix(data = testing_split.sparse)
xgb.predict <- predict(xgmodel, d_test)
rmse(log(testing_split$SalePrice),log(xgb.predict))## [1] 0.1367712
Usually after we are done with xgboost modelling, it is good to take a look at the feature importance scores:
# Per-feature Gain/Cover/Frequency importance from the trained booster.
feature.importance <- xgb.importance(model = xgmodel)
head(feature.importance,10)## Feature Gain Cover Frequency
## 1: 15 0.33531155 0.022654031 0.016126392
## 2: 44 0.16093956 0.053874109 0.030983955
## 3: 59 0.11719755 0.005054773 0.003233464
## 4: 25 0.08666229 0.002822118 0.002742305
## 5: 42 0.03418820 0.062727062 0.025171906
## 6: 32 0.03386903 0.050876949 0.042689915
## 7: 28 0.03135810 0.004794528 0.004256713
## 8: 36 0.02014810 0.039310844 0.031270465
## 9: 41 0.01986155 0.032183388 0.024926326
## 10: 18 0.01772089 0.026311366 0.027095612
xgb.plot.importance(importance_matrix = feature.importance)colnames(train)[15]## [1] "BldgType"
So feature 15 (BldgType) is deemed very important by xgboost.
Simply ensemble. More is better than 1.
(perhaps in future, I can do a comparison for all the different combination)
rmse.ensemble <- data.frame(test = log(testing_split$SalePrice), prediction = log(( prediction.rf + prediction.lasso$fit + xgb.predict) / 3))
rmse(rmse.ensemble$test, rmse.ensemble$prediction)## [1] 0.1340795
# nu-regression SVM on all predictors.
# NOTE(review): gamma/cost appear hand-picked — no tuning (e.g. tune.svm)
# is shown in this file; confirm how they were chosen.
model.svm <- svm(SalePrice ~. , training_split, type = "nu-regression", gamma = 0.01, cost = 3)
prediction.svm <- predict(model.svm, testing_split)
rmse(log(testing_split$SalePrice), log(prediction.svm))## [1] 0.1724682
Let us retrain all the models on the entire training set, and have our test data set fit on them. (I did not use the regression model in the end, as it lowered the overall score.)
# Refit the stepwise-selected regression on the FULL training set.
# NOTE(review): per the text above, this model's predictions are computed
# below but not included in the final ensemble.
reg.model.full <- lm(formula = SalePrice ~ OverallQual + GrLivArea+YearBuilt + KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea + ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st + BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea +
YrSold + ScreenPorch + Street + SaleCondition + GarageFinish +
BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd +
LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood +
KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure +
Condition1, data = train)
prediction.reg.full <- predict(reg.model.full,test)# create matrix of data sets
# Full-data lasso: build plain matrices for lars(), selecting columns by NAME
# so the code no longer depends on SalePrice sitting at position 77.
# (The test set has no SalePrice column, so setdiff() leaves it untouched.)
train.covariates <- as.matrix(train[, setdiff(names(train), "SalePrice")])
train.outcome <- as.matrix(train[, "SalePrice"])
test.matrix <- as.matrix(test[, setdiff(names(test), "SalePrice")])
lasso.full <- lars(train.covariates, train.outcome, type = "lasso")
# obtain the step that minimizes Mallows' Cp
best.s <- lasso.full$df[which.min(lasso.full$Cp)]
# predict
prediction.lasso.full <- predict.lars(lasso.full, newx = test.matrix, s = best.s, type = "fit")rf.full <- randomForest(SalePrice ~., data = train)
prediction.rf.full <- predict(rf.full, test)# loading in the full train and test set
training.sparse <- sparse.model.matrix(SalePrice ~. -1, data = train[2:77])
training <- xgb.DMatrix(training.sparse, label = train[,"SalePrice"])
testing <- xgb.DMatrix(data.matrix(test[,2:76]))
xgbmodel.full <- xgb.train(data = training, params = param, nrounds = 319, watchlist = list(train = training), verbose = 1, print_every_n = 50)## [1] train-rmse:178836.671875
## [51] train-rmse:6537.377930
## [101] train-rmse:2892.638916
## [151] train-rmse:1353.408569
## [201] train-rmse:640.623413
## [251] train-rmse:313.330017
## [301] train-rmse:156.633575
## [319] train-rmse:129.403961
xgb.predict.full <- predict(xgbmodel.full, testing)model.svm.full <- svm(SalePrice ~. , train, type = "nu-regression", gamma = 0.01, cost =3)
prediction.svm.full <- predict(model.svm.full, test)ensemble.price <- (xgb.predict.full + prediction.rf.full + prediction.lasso.full$fit) / 3
submission <- data.frame(Id = test$Id, SalePrice = ensemble.price)
write.csv(submission,'submission.csv', row.names = FALSE)